First, let’s load the Power 2011 region database. This will be used as an “atlas” throughout, to guide the development of the regions.
# Load the Power et al. (2011) ROI table: one row per ROI, with MNI
# coordinates (X, Y, Z), a numeric network ID, a plotting color, and the
# network's name. Explicit col_types keep the ID/coordinate columns numeric.
# The CSV contains extra unnamed columns (hence the "Missing column names"
# warning echoed below); they are dropped by the select().
power2011 <- read_csv("../bin/power_2011.csv",
col_types = cols(ROI=col_double(),
X = col_double(),
Y = col_double(),
Z = col_double(),
Network = col_double(),
Color = col_character(),
NetworkName = col_character())) %>%
dplyr::select(ROI, X, Y, Z, Network, Color, NetworkName)
## Warning: Missing column names filled in: 'X8' [8], 'X9' [9], 'X10' [10],
## 'X11' [11], 'X12' [12], 'X13' [13], 'X14' [14], 'X15' [15], 'X16' [16],
## 'X17' [17]
We now need to load the group-level data. In essence, this corresponds to creating a matrix X in which every individual is a row and every column is a different ROI-to-ROI connection.
# Accumulators filled by the loading loop below: NOFLY collects subjects
# with NA connectivity values; SUBJS the subject IDs in loading order.
NOFLY <- c()
SUBJS <- c()
# All ordered "i-j" ROI pair labels for the full ROI-by-ROI grid,
# flattened column-major.
# NOTE(review): this variable shadows readr::cols(); calls to cols(...)
# below still resolve to the readr function (R skips non-function values
# when looking up a call), but the name is unfortunate.
cols <- outer(power2011$ROI, power2011$ROI, function(x, y) {paste(x, y, sep="-")})
cols %<>% as.vector
# Canonical "lo-hi" label for a pair of ROI numbers, independent of the
# order in which the two arguments are given.
connection <- function(x, y) {
  lo <- min(x, y)
  hi <- max(x, y)
  paste(lo, hi, sep = "-")
}
# Element-wise version, suitable for use with outer().
vconnection <- Vectorize(connection)
# Statistical mode: the most frequent value in x (first encountered wins
# on ties).
#
# Args:
#   x: a vector (any type supported by unique()/match()).
#   na.rm: if TRUE, drop NA values before computing the mode.
# Returns: the modal value of x.
Mode <- function(x, na.rm = FALSE) {   # FALSE, not F: F is reassignable
  if (na.rm) {
    x <- x[!is.na(x)]
  }
  ux <- unique(x)
  # tabulate(match(...)) counts occurrences of each unique value.
  ux[which.max(tabulate(match(x, ux)))]
}
# One row per network: its numeric ID and the most common name attached to
# that ID across ROIs (via Mode; the mean of a constant group is the ID
# itself).
reduced_power2011 <- power2011 %>%
dplyr::select(Network, NetworkName) %>%
group_by(Network) %>%
summarize(Network = mean(Network), NetworkName = Mode(NetworkName))
# Map a pair of network IDs to a "Name1-Name2" label, ordering the pair so
# the smaller ID comes first.
# FIX: dplyr::select() returns a one-column tibble, and paste() on a tibble
# deparses it rather than using its value; pull() extracts the actual
# character string (matching the behavior of connection_name2 below).
connection_name <- function(x, y) {
  first <- min(x, y)
  second <- max(x, y)
  paste(reduced_power2011 %>% filter(Network == first) %>% pull(NetworkName),
        reduced_power2011 %>% filter(Network == second) %>% pull(NetworkName),
        sep="-")
}
# Element-wise version, suitable for use with outer().
vconnection_name <- Vectorize(connection_name)
# Map a pair of network IDs to a "Name1-Name2" label (smaller ID first),
# looking the names up directly in the reduced Power (2011) table.
connection_name2 <- function(x, y) {
  lo <- min(x, y)
  hi <- max(x, y)
  name_lo <- reduced_power2011$NetworkName[reduced_power2011$Network == lo]
  name_hi <- reduced_power2011$NetworkName[reduced_power2011$Network == hi]
  paste(name_lo, name_hi, sep = "-")
}
# Element-wise version, suitable for use with outer().
vconnection_name2 <- Vectorize(connection_name2)
# Network-pair ID labels ("n1-n2", smaller first) for every cell of the
# ROI-by-ROI grid, flattened column-major to align with `cols`.
nets <- as.vector(outer(power2011$Network, power2011$Network, vconnection))
# The same grid, but with human-readable network names.
netnames <- as.vector(outer(power2011$Network, power2011$Network, vconnection_name2))
# Load every subject's resting-state connectivity matrix into C (one row
# per subject, one column per ROI-to-ROI connection).
# FIX: the original pattern "sub-*" means "sub" followed by zero or more
# hyphens (the * applied to "-"); "^sub-" anchors the intended prefix.
subdirs <- grep("^sub-", dir("../rsfmri/REST1"), value = TRUE)
n <- length(subdirs)
# Preallocate the subjects-by-connections matrix.
C <- matrix(0, nrow = n, ncol = length(cols))
j <- 1
R <- NULL
PR <- NULL
for (sub in subdirs) {
  # Record the subject ID (the part after "sub-").
  SUBJS %<>% c(strsplit(sub, "-")[[1]][2])
  # Load this subject's partial-correlation matrix; the first column is a
  # character label, the rest are the correlations.
  M <- paste("../rsfmri/REST1",
             sub,
             "ses-01/mr_pcorr.txt", sep="/") %>%
    read_csv(skip = 1,
             col_names = FALSE,
             col_types = cols(
               .default = col_double(),
               X1 = col_character()
             )) %>%
    as.matrix()
  # Drop the label column; 2:ncol(M) generalizes the hard-coded 2:265.
  # v spreads M column-wise. M is symmetrical, so it should not matter,
  # but better not risk it.
  v <- as_vector(M[, 2:ncol(M)])
  C[j,] <- v
  if (length(v[is.na(v)]) > 0) {
    print(paste("NA detected in sub", sub))
    NOFLY %<>% c(sub)  # Adds sub to the NOFLY list
  }
  j <- j + 1
}
If we want, we can restrict the analysis only to a limited set of networks (and their cross-network connections) by modifying the NOI (Networks of Interest) variable. The variable will be used to create a second list, COI (Connections of interest), which will contain the possible list of network-to-network connections for the networks in NOI. (This is currently not needed, since the \(X\) matrix is already restricted to meaningful connections).
# Networks of Interest: the analysis is restricted to these networks and
# their cross-network connections. (Currently all 14 Power networks are
# listed, so nothing is actually filtered out.)
NOI <- c(
  "Uncertain",
  "Sensory/somatomotor Hand",
  "Sensory/somatomotor Mouth",
  "Cingulo-opercular Task Control",
  "Auditory",
  "Default mode",
  "Memory retrieval?",
  "Ventral attention",
  "Visual",
  "Fronto-parietal Task Control",
  "Salience",
  "Subcortical",
  "Cerebellar",
  "Dorsal attention"
)
# Connections of Interest: every ordered "A-B" pairing of the networks above.
COI <- as.vector(outer(NOI, NOI, function(x, y) paste(x, y, sep = "-")))
Now, we need to remove some columns from the hyper-large X matrix, and define proper groupings for Lasso.
First, we ensure that the data in the connectivity matrix C is actually numeric.
# Defensive cast: make sure every column of the connectivity matrix is numeric.
C <- apply(C, 2, FUN=as.numeric)
Then, we create a set of “censor” vectors, each of which is a binary vector with the same length as the number of columns of C. If the j-th element of the censor vector is TRUE, the corresponding column in C is kept, otherwise it is removed from the possible regressors.
The first censor vector simply removes the redundant columns (since the connectivity from A to B is the same as the connectivity of B to A) and the self-correlations:
# Keep only the strict upper triangle of the ROI-by-ROI grid (x < y):
# this drops self-connections and one copy of each symmetric pair.
censor <- as.vector(outer(power2011$ROI,
                          power2011$ROI,
                          function(x, y) x < y))
The second censor vector removes unlikely functional connections: Those with a partial correlation value \(|r| < 0.05\).
# Keep only connections whose mean partial correlation across subjects
# exceeds 0.05 in absolute value.
censor2 <- abs(colMeans(C)) > 0.05
Now, we combine the censor vectors in a tibble that contains all of the relevant information about each column in C.
# One row per cell of the flattened ROI-by-ROI grid, carrying everything we
# know about that potential regressor (its column index in C, network pair,
# ROI pair, and the two censor flags).
# NOTE(review): `order` shadows base::order(); kept because later chunks
# refer to the tibble by this name.
order <- tibble(index = 1:length(nets),
network = nets,
network_names = netnames,
connection = cols,
censor=censor,
censor2 = censor2)
# Sort by network pair; `index` still records each row's column in C.
order %<>% arrange(network)
And we remove all entries for which a censor vector is FALSE (we also create a grouping factor G, in case in the future we want to use Group Lasso)
# Indices of the columns of C that survive all censors: non-redundant
# (censor), non-negligible mean correlation (censor2), and within the
# networks of interest (COI).
I <- order %>%
  filter(censor == TRUE) %>%
  filter(censor2 == TRUE) %>%
  filter(network_names %in% COI) %>%
  dplyr::select(index)
# Grouping factor for (Group) Lasso. FIX: G must align column-by-column
# with X = C[, I], so it has to apply the same filters as I -- including
# censor2, which the original version omitted (leaving G longer than
# ncol(X)).
G <- order %>%
  filter(censor == TRUE) %>%
  filter(censor2 == TRUE) %>%
  filter(network_names %in% COI) %>%
  dplyr::select(network)
# G is the real grouping factor for Lasso!
As a last step, we create the “real” regressor matrix \(X\), which is the proper subset of \(C\) after removing all of the censored columns.
# The regressor matrix: only the columns of C that survive all censors.
X <- C[,as_vector(I)]
Now we need to load the dependent variable. In this case, it is a binary variable that determines which strategy model best fits the behavioral data of an individual, whether it is the “memory” strategy (\(Y = 1\)) or the “procedural” strategy (\(Y = 2\)).
# Dependent variable: for each subject, which ACT-R strategy model
# ("memory" vs "procedural") best fits their behavioral data.
dvs <- read_csv("../actr-models/model_output/MODELLogLikelihood.csv",
col_types = cols(
.default = col_double(),
HCPID = col_character(),
best_model = col_character()
)) %>%
dplyr::select(HCPID, best_model) %>%
arrange(HCPID)
## Warning: Missing column names filled in: 'X1' [1]
Now we select only the rows of \(X\) and the values of \(Y\) for which we have both rsfMRI and model data:
# HCP IDs in the model output carry an "_fnca" suffix; add it to our
# subject list so the two ID sets can be matched.
subjs_hcp <- paste(SUBJS, "fnca", sep = "_")
# Keep only subjects present in BOTH the rsfMRI data and the model fits.
shared_ids <- intersect(subjs_hcp, dvs$HCPID)
keep_X <- subjs_hcp %in% shared_ids
keep_Y <- dvs$HCPID %in% shared_ids
Y <- dvs$best_model[keep_Y]
X <- X[keep_X, ]
Finally, we transform the dependent variable \(Y\) into a binary numeric variable with values \((0, 1)\), so that we can use logistic regression.
# Recode the strategy label as 0/1 (alphabetical factor-level order) so it
# can be used in logistic regression.
Y <- as.numeric(as.factor(Y)) - 1
Let’s do some visualization and analysis of our independent and dependent variables, just to ensure there are no obvious problems.
The regressors \(X\) is certainly multi-collinear; that is a consequence of having a large number of predictors \(p > n\), which, in turn, is one of the reasons why we are using Lasso. Too much collinearity, however, could be really bad and push Lasso towards selecting non-optimal regressors. To gather a sense of how much collinearity we have, we can plot the distribution of correlations among regressors:
# Pairwise correlations between regressors; the lower triangle (without the
# diagonal) gives each pair exactly once.
corX <- cor(X)
distCor <- as_vector(corX[lower.tri(corX, diag = F)])
distTibble <- as_tibble(data.frame(R=distCor))
# Histogram of between-regressor correlations (collinearity check).
ggplot(distTibble, aes(x=R)) +
geom_histogram(col="white", alpha=0.5, binwidth = 0.05) +
theme_pander() +
ylab("Number of Correlations") +
xlab("Correlation Value") +
ggtitle("Distribution of Correlation Values Between Regressors")
All in all, the collinearity is not that bad—all regressors are correlated at \(|r| < 0.25\), and most of them are correlated at \(|r| < 0.1\), with a peak at \(r = 0\).
And now, let’s visualize the histogram of the dependent variable we are trying to predict:
# Bar plot of the class distribution of the dependent variable.
dependent <- as_tibble(data.frame(strategy=Y))
ggplot(dependent, aes(x = factor(strategy), fill=as.factor(strategy))) +
geom_bar(col="white", alpha=0.5, width = .5) +
scale_fill_aaas() +
xlab("Strategy") +
ylab("Number of Participants") +
ggtitle("Distribution of Strategy Use") +
theme_pander() +
theme(legend.position = "none")
Because the classes are not equally distributed, and participants are more likely to use the memory strategy (\(Y=0\)) than the procedural one (\(Y = 1\)), we would need to adjust the weights of our Lasso model.
To analyze the data, we will use Lasso, a statistical learning method based on penalized regression.
Most of the entries in our \(Y\) vector are coded as “0” (i.e., most participants use the memory strategy), which creates an imbalance. We are going to create an appropriate set of weights so that the two classes are balanced.
# Observation weights that balance the two classes: the majority class
# (Y = 0, "memory") is weighted by the minority proportion mean(Y), and
# the minority class (Y = 1, "procedural") by 1 - mean(Y).
W <- ifelse(Y == 0, mean(Y), 1 - mean(Y))
We can now define the lasso model. We will use the elastic net approach with \(\alpha = 1\) to generate a pure lasso model. The model will use a binomial (i.e., logistic) regression and will measure the cross-validation error as class misassignment.
# Lasso path (alpha = 1) with a logistic link, using the balancing weights
# W; standardize = T scales the regressors internally.
# NOTE(review): type.measure is a cv.glmnet() argument; glmnet() itself
# does not use it -- confirm it is not silently ignored here.
fit <- glmnet(y = Y,
x = X,
alpha=1,
weights = W,
family = "binomial",
type.measure = "class",
standardize = T
)
To choose the optimal value of \(\lambda\) in Lasso, we will examine the cross-validation error during a LOO cross-validation.
# Leave-one-out cross-validation (nfolds = N) over the lambda path,
# scoring by misclassification rate. grouped = F is required when each
# fold contains a single observation.
fit.cv <- cv.glmnet(y = Y,
x = X,
alpha=1,
family = "binomial",
weights = W,
type.measure = "class",
standardize=T,
nfolds=length(Y),
grouped = F
)
Now, let’s look at the cross-validation error profile.
# Cross-validation error as a function of log(lambda).
plot(fit.cv)
The profile has the characteristic U-shape or increasing curve, with more error as \(\lambda\) increases. As recommended by Tibshirani, we will select the “lambda 1SE” value, which is the largest \(\lambda\) value that does not differ by more than 1 SE from the \(\lambda\) value that gives us the minimum cross-validation error. This guarantees the maximum generalizability.
We can also visualize the profile of the beta weights of each connection for different values of \(\lambda\).
# Coefficient paths; the dashed line marks the L1 norm of the betas at
# lambda.1se, i.e. where the selected model sits on the path.
plot(fit, sub="Beta Values for Connectivity")
L1norm <- sum(abs(fit$beta[,which(fit$lambda==fit.cv$lambda.1se)]))
abline(v=L1norm, lwd=2, lty=2)
And now, plot prettier version
# Prettier version of the CV curve: error +/- 1 SD across lambda.
lasso_df <- as_tibble(data.frame(lambda=fit.cv$lambda,
error=fit.cv$cvm,
sd=fit.cv$cvsd))
ggplot(lasso_df, aes(x=lambda, y=error)) +
geom_line(aes(col=error), lwd=2) +
scale_color_viridis("Error", option = "plasma") +
geom_ribbon(aes(ymin=error -sd, ymax=error + sd), alpha=0.2,fill="blue") +
xlab(expression(lambda)) +
ylab("Cross-Validation Error") +
ggtitle(expression(paste(bold("Cross Validation Error Across "), lambda))) +
# Vertical line at the lambda with minimum CV error.
geom_vline(xintercept = lasso_df$lambda[lasso_df$error==min(lasso_df$error)]) +
theme_pander() +
theme(legend.position="right")
The minimum-error \(\lambda_{min}\) is 0.0313777; the 1-SE \(\lambda_{1SE}\) is 0.0313777.
Let’s have a better look at the relevant connections that survive the Lasso penalty at the value of \(\lambda_{min} + 1 SE\). We start by saving these connections in a tibble, and saving the data on a file for later use.
# Beta weights at lambda.1se, joined back to the connection metadata; only
# connections that survive the penalty (Beta != 0) are kept.
betas <- fit$beta[, which(fit$lambda==fit.cv$lambda.1se)]
conn_betas <- as_tibble(data.frame(index=I$index, Beta=betas))
connectome <- order %>%
  filter(index %in% I$index) %>%
  # Explicit join key: silences the "Joining, by = ..." message and guards
  # against accidental joins on other shared columns.
  inner_join(conn_betas, by = "index") %>%
  dplyr::select(-censor2) %>%
  filter(Beta != 0) %>%
  arrange(index)
# Save the surviving connections and the fitted objects for later use.
write_csv(connectome, file="strategy_mr.csv")
save(fit, fit.cv, X, Y, order, I, G, file="strategy_mr.RData")
Finally, we can visualize the table of connections
# Render the surviving connections as a styled HTML table.
# NOTE(review): piping through xtable() before kable() is unusual; kable()
# on the plain tibble would likely suffice -- confirm the intended output.
connectome %>%
xtable() %>%
kable(digits = 5) %>%
kable_styling(bootstrap_options = c("striped", "hover"))
| index | network | network_names | connection | censor | Beta |
|---|---|---|---|---|---|
| 8206 | 1-1 | Sensory/somatomotor Hand-Sensory/somatomotor Hand | 22-32 | TRUE | -1.89086 |
| 8207 | 1-1 | Sensory/somatomotor Hand-Sensory/somatomotor Hand | 23-32 | TRUE | 2.30505 |
| 9264 | 1-1 | Sensory/somatomotor Hand-Sensory/somatomotor Hand | 24-36 | TRUE | -2.03310 |
| 10063 | 1-1 | Sensory/somatomotor Hand-Sensory/somatomotor Hand | 31-39 | TRUE | 2.81201 |
| 16959 | 4-4 | Auditory-Auditory | 63-65 | TRUE | 0.53776 |
| 18017 | 4-4 | Auditory-Auditory | 65-69 | TRUE | 4.03346 |
| 18020 | 4-4 | Auditory-Auditory | 68-69 | TRUE | -0.26776 |
| 18261 | 2-4 | Sensory/somatomotor Mouth-Auditory | 45-70 | TRUE | 2.48728 |
| 18274 | 3-4 | Cingulo-opercular Task Control-Auditory | 58-70 | TRUE | 0.04146 |
| 18282 | 4-4 | Auditory-Auditory | 66-70 | TRUE | -0.11546 |
| 21995 | -1-5 | Uncertain-Default mode | 83-84 | TRUE | 0.68047 |
| 24368 | 5-5 | Default mode-Default mode | 80-93 | TRUE | 1.69981 |
| 24906 | 5-5 | Default mode-Default mode | 90-95 | TRUE | -3.74493 |
| 24907 | 5-5 | Default mode-Default mode | 91-95 | TRUE | -5.51874 |
| 26222 | 5-5 | Default mode-Default mode | 86-100 | TRUE | -0.04432 |
| 28059 | 5-5 | Default mode-Default mode | 75-107 | TRUE | -1.74162 |
| 28323 | 5-5 | Default mode-Default mode | 75-108 | TRUE | -2.93711 |
| 31420 | -1-5 | Uncertain-Default mode | 4-120 | TRUE | 0.37132 |
| 31534 | 5-5 | Default mode-Default mode | 118-120 | TRUE | 0.52838 |
| 32816 | 5-5 | Default mode-Default mode | 80-125 | TRUE | 4.46703 |
| 33124 | 5-5 | Default mode-Default mode | 124-126 | TRUE | -0.50586 |
| 34942 | 5-6 | Default mode-Memory retrieval? | 94-133 | TRUE | 1.66036 |
| 35205 | 5-6 | Default mode-Memory retrieval? | 93-134 | TRUE | -2.41717 |
| 35469 | 5-6 | Default mode-Memory retrieval? | 93-135 | TRUE | -1.37214 |
| 36007 | 5-5 | Default mode-Default mode | 103-137 | TRUE | -1.63355 |
| 37613 | 5-7 | Default mode-Visual | 125-143 | TRUE | 5.99955 |
| 40807 | 7-7 | Visual-Visual | 151-155 | TRUE | -2.08989 |
| 41075 | 7-7 | Visual-Visual | 155-156 | TRUE | -3.04218 |
| 41328 | 7-7 | Visual-Visual | 144-157 | TRUE | 0.52580 |
| 41864 | 7-7 | Visual-Visual | 152-159 | TRUE | 0.89705 |
| 42660 | 7-7 | Visual-Visual | 156-162 | TRUE | 1.58602 |
| 42927 | 7-7 | Visual-Visual | 159-163 | TRUE | -11.65413 |
| 43186 | 7-7 | Visual-Visual | 154-164 | TRUE | 2.06833 |
| 43458 | 7-7 | Visual-Visual | 162-165 | TRUE | 1.08723 |
| 44501 | 7-7 | Visual-Visual | 149-169 | TRUE | -0.83859 |
| 44514 | 7-7 | Visual-Visual | 162-169 | TRUE | -3.29812 |
| 45566 | 7-7 | Visual-Visual | 158-173 | TRUE | 1.09878 |
| 45810 | 8-11 | Fronto-parietal Task Control-Ventral attention | 138-174 | TRUE | 2.70591 |
| 47700 | 8-8 | Fronto-parietal Task Control-Fronto-parietal Task Control | 180-181 | TRUE | -1.06966 |
| 48220 | -1-7 | Uncertain-Visual | 172-183 | TRUE | 0.97763 |
| 48759 | -1–1 | Uncertain-Uncertain | 183-185 | TRUE | 0.31034 |
| 49015 | 8-8 | Fronto-parietal Task Control-Fronto-parietal Task Control | 175-186 | TRUE | -1.74089 |
| 50082 | 8-8 | Fronto-parietal Task Control-Fronto-parietal Task Control | 186-190 | TRUE | -4.20237 |
| 51142 | 8-8 | Fronto-parietal Task Control-Fronto-parietal Task Control | 190-194 | TRUE | -1.14423 |
| 52464 | 8-8 | Fronto-parietal Task Control-Fronto-parietal Task Control | 192-199 | TRUE | -2.95918 |
| 53356 | 1-9 | Sensory/somatomotor Hand-Salience | 28-203 | TRUE | 1.67191 |
| 53422 | 5-9 | Default mode-Salience | 94-203 | TRUE | 2.93233 |
| 54042 | 8-9 | Fronto-parietal Task Control-Salience | 186-205 | TRUE | -4.51978 |
| 54221 | 5-9 | Default mode-Salience | 101-206 | TRUE | -1.41648 |
| 55120 | 9-9 | Salience-Salience | 208-209 | TRUE | 6.91535 |
| 55500 | 3-9 | Cingulo-opercular Task Control-Salience | 60-211 | TRUE | 3.23332 |
| 55649 | 9-9 | Salience-Salience | 209-211 | TRUE | -1.12886 |
| 56708 | 9-9 | Salience-Salience | 212-215 | TRUE | -0.09215 |
| 57240 | 9-9 | Salience-Salience | 216-217 | TRUE | -0.91650 |
| 57770 | 9-9 | Salience-Salience | 218-219 | TRUE | -0.45811 |
| 61041 | 3-10 | Cingulo-opercular Task Control-Subcortical | 57-232 | TRUE | -0.28020 |
| 61745 | 10-10 | Subcortical-Subcortical | 233-234 | TRUE | -0.15582 |
| 62630 | 4-11 | Auditory-Ventral attention | 62-238 | TRUE | -2.21331 |
| 63756 | -1-11 | Uncertain-Ventral attention | 132-242 | TRUE | -0.75419 |
| 64071 | -1-13 | Uncertain-Cerebellar | 183-243 | TRUE | -4.11193 |
| 66780 | -1-12 | Uncertain-Dorsal attention | 252-253 | TRUE | 3.47463 |
| 67242 | 1-8 | Sensory/somatomotor Hand-Fronto-parietal Task Control | 186-255 | TRUE | -0.60621 |
| 67260 | 1-9 | Sensory/somatomotor Hand-Salience | 204-255 | TRUE | -0.26895 |
| 67836 | 12-12 | Dorsal attention-Dorsal attention | 252-257 | TRUE | -0.14686 |
| 67838 | -1-12 | Uncertain-Dorsal attention | 254-257 | TRUE | 0.00521 |
| 67873 | 1-12 | Sensory/somatomotor Hand-Dorsal attention | 25-258 | TRUE | 1.37378 |
| 68814 | 8-12 | Fronto-parietal Task Control-Dorsal attention | 174-261 | TRUE | -2.42184 |
| 69218 | 3-12 | Cingulo-opercular Task Control-Dorsal attention | 50-263 | TRUE | -2.01954 |
| 69461 | 1-12 | Sensory/somatomotor Hand-Dorsal attention | 29-264 | TRUE | 0.33782 |
And we can do some statistics. For example, which networks do these regions and connections belong to? Are they different from what would be expected from a random sample of connections from the connectome?
# Parse the two ROI numbers out of a "from-to" connection label.
region_from <- function(s) {
  as.numeric(strsplit(s, "-")[[1]][1])
}
region_to <- function(s) {
  as.numeric(strsplit(s, "-")[[1]][2])
}
# Element-wise versions for whole columns of labels.
vregion_from <- Vectorize(region_from)
vregion_to <- Vectorize(region_to)
# All distinct ROIs that appear in at least one surviving connection.
lROIs <- unique(c(vregion_from(connectome$connection),
vregion_to(connectome$connection)))
# NOTE(review): row-indexing power2011 by ROI number assumes the table's
# rows are ordered 1..N by ROI -- confirm against the CSV.
rois <- power2011[lROIs,]
# Proportion of the selected ROIs that falls in each network.
roi_stats <- rois %>%
group_by(NetworkName, Color) %>%
summarise(N=length(Color)/length(lROIs)) %>%
add_column(Source="Lasso")
## `summarise()` has grouped output by 'NetworkName'. You can override using the `.groups` argument.
# Baseline: proportion of ALL Power ROIs (within the networks of interest)
# in each network, for comparison against the Lasso selection.
subsetPower <- filter(power2011,
NetworkName %in% NOI)
noi_stats <- subsetPower %>%
group_by(NetworkName, Color) %>%
summarise(N=length(Color)/dim(subsetPower)[1]) %>%
add_column(Source="Power")
## `summarise()` has grouped output by 'NetworkName'. You can override using the `.groups` argument.
# Side-by-side pie charts: network distribution of Lasso-selected ROIs vs.
# the full Power parcellation.
total_stats <- rbind(roi_stats, noi_stats)
ggplot(total_stats, aes(x="", y=N, fill=NetworkName)) +
geom_bar(stat = "identity", col="white", width=1) +
facet_grid(~Source, labeller = label_both) +
coord_polar("y", start=0) +
scale_y_continuous(labels = scales::percent_format(accuracy = 2L)) +
#scale_fill_viridis(discrete = T) +
scale_fill_ucscgb() +
ylab("") +
xlab("") +
# FIX: corrected title typo ("Distriution" -> "Distribution").
ggtitle("Distribution of Regions") +
geom_text_repel(aes(label=percent(N, .01)), col="black",
position=position_stack(vjust=1), size=3)+
theme_pander() +
guides(fill=guide_legend(ncol=2)) +
theme(legend.position = "bottom")
There is no difference in the distribution of ROIs:
# Compare the observed network counts (proportions * number of ROIs)
# against the Power baseline proportions.
# NOTE(review): this assumes roi_stats and noi_stats list the same networks
# in the same order, and that the p vector sums to 1 -- confirm.
chisq.test(roi_stats$N*length(lROIs), p = noi_stats$N)
## Warning in chisq.test(roi_stats$N * length(lROIs), p = noi_stats$N): Chi-squared
## approximation may be incorrect
##
## Chi-squared test for given probabilities
##
## data: roi_stats$N * length(lROIs)
## X-squared = 13.267, df = 13, p-value = 0.4274
And now, let’s look at the sparsity
# Extract the first / last piece of a "Name1-Name2" connection label.
# NOTE(review): network names themselves contain hyphens (e.g.
# "Cingulo-opercular Task Control"), so splitting on "-" does NOT reliably
# recover the two names; these helpers are kept for backward compatibility
# but should not be trusted for hyphenated names.
net_from <- function(s) {as.character(strsplit(s, "-")[[1]][1])}
net_to <- function(s) {as.character(strsplit(s, "-")[[1]][2])}
vnet_from <- Vectorize(net_from)
vnet_to <- Vectorize(net_to)
# Classify a connection label as "Within" (a network paired with itself) or
# "Between". A label built by paste(a, b, sep="-") is within-network iff it
# has the form "a-a": odd length, a "-" exactly in the middle, and two
# identical halves. FIX: the previous strsplit-based comparison
# misclassified within-network pairs of hyphenated names (e.g.
# "Cingulo-opercular Task Control-Cingulo-opercular Task Control") as
# "Between", because the split cut inside the name itself.
connectivity <- function(s) {
  n <- nchar(s)
  h <- (n - 1) %/% 2
  if (n %% 2 == 1 &&
      substr(s, h + 1, h + 1) == "-" &&
      substr(s, 1, h) == substr(s, h + 2, n)) {
    "Within"
  } else {
    "Between"
  }
}
vconnectivity <- Vectorize(connectivity)
# All candidate connections (after censoring and network filtering),
# labeled with their from/to networks and within/between type.
coi <- order %>%
filter(censor == TRUE) %>%
filter(network_names %in% COI)
coi$from <- vnet_from(coi$network_names)
coi$to <- vnet_to(coi$network_names)
coi$connection_type <- vconnectivity(coi$network_names)
# Baseline proportion of within- vs between-network connections.
coi_stats <- coi %>%
group_by(connection_type) %>%
summarise(N=length(index), P=length(index)/dim(coi)[1]) %>%
add_column(Source = "Power et al. (2011)")
# Same within/between breakdown for the Lasso-selected connections.
connectome$connection_type <- vconnectivity(connectome$network_names)
connectome_stats <- connectome %>%
group_by(connection_type) %>%
summarise(N=length(index), P=length(index)/dim(connectome)[1]) %>%
add_column(Source = "Lasso Analysis")
# Side-by-side pie charts: within/between connectivity of the Lasso
# selection vs. the candidate pool.
ggplot(total_stats2, aes(x="", y=P, fill=connection_type)) +
geom_bar(stat = "identity", col="grey", width=1) +
facet_grid(~Source, labeller = label_both) +
coord_polar("y", start=0) +
scale_y_continuous(labels = scales::percent_format(accuracy = 2L)) +
scale_fill_jama() +
scale_color_jama() +
ylab("") +
xlab("") +
# FIX: corrected title typo ("Distribuction" -> "Distribution").
ggtitle("Distribution of Connectivity\n(Within/Between Networks)") +
geom_text_repel(aes(label=percent(P, .1)), col="white",
position=position_stack(vjust=1), size=3)+
theme_pander() +
theme(legend.position = "bottom")
The difference between the two distributions is significant.
# Test whether the Lasso selection's within/between split differs from the
# baseline proportions in the candidate pool.
chisq.test(connectome_stats$N, p=coi_stats$P)
##
## Chi-squared test for given probabilities
##
## data: connectome_stats$N
## X-squared = 152.61, df = 1, p-value < 2.2e-16
Finally, we can look at how many connections each region belongs to:
# Count how many surviving connections each ROI participates in.
rois <- c(vregion_from(connectome$connection), vregion_to(connectome$connection))
rois_t <- tibble(roi = rois)
rois_t %>%
group_by(roi) %>%
summarize(N = length(roi)) %>%
arrange(N) -> rois_dist
# Render the per-ROI counts as a styled HTML table.
rois_dist %>%
xtable() %>%
kable(digits = 5) %>%
kable_styling(bootstrap_options = c("striped", "hover"))
| roi | N |
|---|---|
| 4 | 1 |
| 22 | 1 |
| 23 | 1 |
| 24 | 1 |
| 25 | 1 |
| 28 | 1 |
| 29 | 1 |
| 31 | 1 |
| 36 | 1 |
| 39 | 1 |
| 45 | 1 |
| 50 | 1 |
| 57 | 1 |
| 58 | 1 |
| 60 | 1 |
| 62 | 1 |
| 63 | 1 |
| 66 | 1 |
| 68 | 1 |
| 83 | 1 |
| 84 | 1 |
| 86 | 1 |
| 90 | 1 |
| 91 | 1 |
| 100 | 1 |
| 101 | 1 |
| 103 | 1 |
| 107 | 1 |
| 108 | 1 |
| 118 | 1 |
| 124 | 1 |
| 126 | 1 |
| 132 | 1 |
| 133 | 1 |
| 134 | 1 |
| 135 | 1 |
| 137 | 1 |
| 138 | 1 |
| 143 | 1 |
| 144 | 1 |
| 149 | 1 |
| 151 | 1 |
| 152 | 1 |
| 154 | 1 |
| 157 | 1 |
| 158 | 1 |
| 163 | 1 |
| 164 | 1 |
| 165 | 1 |
| 172 | 1 |
| 173 | 1 |
| 175 | 1 |
| 180 | 1 |
| 181 | 1 |
| 185 | 1 |
| 192 | 1 |
| 194 | 1 |
| 199 | 1 |
| 204 | 1 |
| 205 | 1 |
| 206 | 1 |
| 208 | 1 |
| 212 | 1 |
| 215 | 1 |
| 216 | 1 |
| 217 | 1 |
| 218 | 1 |
| 219 | 1 |
| 232 | 1 |
| 233 | 1 |
| 234 | 1 |
| 238 | 1 |
| 242 | 1 |
| 243 | 1 |
| 253 | 1 |
| 254 | 1 |
| 258 | 1 |
| 261 | 1 |
| 263 | 1 |
| 264 | 1 |
| 32 | 2 |
| 65 | 2 |
| 69 | 2 |
| 75 | 2 |
| 80 | 2 |
| 94 | 2 |
| 95 | 2 |
| 120 | 2 |
| 125 | 2 |
| 155 | 2 |
| 156 | 2 |
| 159 | 2 |
| 169 | 2 |
| 174 | 2 |
| 190 | 2 |
| 203 | 2 |
| 209 | 2 |
| 211 | 2 |
| 252 | 2 |
| 255 | 2 |
| 257 | 2 |
| 70 | 3 |
| 93 | 3 |
| 162 | 3 |
| 183 | 3 |
| 186 | 4 |
And visualize
# Histogram of connections-per-ROI, annotated with bin counts.
# NOTE(review): ..count.. is deprecated in newer ggplot2;
# after_stat(count) is the modern spelling.
ggplot(rois_dist, aes(x=N)) +
geom_histogram(alpha=0.5, col="white",
binwidth = 1) +
ggtitle("Distribution of Number of Connections per ROI") +
stat_bin(binwidth= 1,
geom="text",
aes(label = paste("N =", ..count..)),
vjust = -1) +
ylim(0, 100) +
theme_pander()
How well is the model doing? To investigate this, we can hand-craft a Leave-One-Out regression model and save the predicted strategy values as well as the recorded beta weights.
# Hand-crafted leave-one-out cross-validation: refit an unpenalized linear
# probability model on N-1 subjects using only the Lasso-selected
# connections (betas != 0), then predict the held-out subject.
# drop = FALSE keeps the matrix shape even if only one connection survives.
selected <- X[, betas != 0, drop = FALSE]
dfX <- data.frame(cbind(Y, selected))
numP <- ncol(selected)    # number of selected predictors
numO <- length(Y)         # number of observations
names(dfX) <- c("Y", paste("X", 1:numP, sep=""))
Yp <- rep(0, numO)        # held-out predictions
Xe <- matrix(rep(0, numP * numO),
             ncol = numP) # per-fold coefficient estimates
for (i in seq_along(Y)) {
  subdfX <- dfX[-i,]      # drop subject i from the training set
  lmod <- lm(Y ~ . + 1, data = subdfX)
  # drop = FALSE keeps a one-row data frame so predict() sees named columns
  # even when numP == 1.
  yp <- predict(object = lmod,
                newdata = dfX[i, 2:(numP + 1), drop = FALSE])
  Yp[i] <- yp
  Xe[i,] <- lmod$coefficients[2:(numP + 1)]
}
Now, let’s do a real predicted vs. observed graph:
# Predicted-vs-observed table: discretize the LOO predictions at 0.5 and
# mark each subject as correctly classified or not.
wcomparison <- tibble(Observed = Y,
                      Predicted = Yp,
                      DiscretePredicted = ifelse(Yp < 0.5, 0, 1)) %>%
  mutate(Accuracy = ifelse(DiscretePredicted == Observed,
                           "Correct",
                           "Misclassified"))
# Correlation truncated (not rounded) to two decimals for the annotation.
rval <- floor(100 * cor(Y, Yp)) / 100
# Predicted vs. observed scatter (jittered), colored by classification
# accuracy, with marginal densities added by ggMarginal.
# NOTE(review): the annotation reports r(N); degrees of freedom for a
# correlation are usually N - 2 -- confirm the intended convention.
p <- ggplot(wcomparison, aes(x=Predicted, y=Observed,
col=Accuracy)) +
geom_point(size=4, alpha=0.6,
position= position_jitter(height = 0.02)) +
geom_abline(intercept = 0, slope = 1,
col="red",
linetype="dashed") +
scale_color_d3() +
theme_pander() +
theme(legend.position = "right") +
guides(col=guide_legend("Classification")) +
coord_fixed(xlim=c(0, 1), ylim=c(0, 1)) +
annotate("text", x=0.3, y=0.7,
label=paste("r(",
length(Y),
") = ",
rval,
", p < 0.001",
sep="")) +
ylab("Observed Strategy") +
xlab("Predicted Strategy") +
ggtitle("Decision Strategy:\nCross-Validation") +
theme(legend.position = "bottom")
ggMarginal(p,
fill="grey",
alpha=0.75,
type="density", #bins=13,
col="darkgrey",
margins = "both")
And now, ROC to get the accuracy.
# ROC from the thresholded (0/1) predictions; with binary predictions the
# "curve" has a single operating point.
wcomparison %<>% mutate(ROCPrediction = if_else(Predicted < 0.5, 0, 1))
rocobj <- roc(wcomparison$Observed, wcomparison$ROCPrediction)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
g <- ggroc(rocobj, col="red") +
geom_point(aes(y=rocobj$sensitivities, x=rocobj$specificities), col="red", size=4, alpha=.5) +
ggtitle("ROC Curve for Model") +
# FIX: specificity is the true negative rate (TNR), not the FPR; the
# original label "Specificity (FPR)" was contradictory.
xlab("Specificity (TNR)") + ylab("Sensitivity (TPR)") +
geom_segment(aes(x = 1, xend = 0, y = 0, yend = 1), color="grey", linetype="dashed") +
theme_pander()
g
The final accuracy of the prediction model is 0.9290805
To assess the robustness of our threshold, we can plot a ROC curve with specifities and sensitivities for sliding values of the prediction threshold.
# Discrete ROC: sweep the classification threshold from 0 to 1 and record
# the true negative rate (specificity) and true positive rate (sensitivity)
# at each step.
thresholds <- seq(0, 1, 0.01)
# Preallocate a list of per-threshold rows instead of growing the tibble
# with rbind() inside the loop (quadratic-copy anti-pattern).
rows <- vector("list", length(thresholds))
for (k in seq_along(thresholds)) {
  threshold <- thresholds[k]
  subthreshold <- wcomparison %>%
    # Clamp predictions into (0, 1] before thresholding.
    mutate(Prediction = ifelse(Predicted > 1, 1, Predicted)) %>%
    mutate(Prediction = ifelse(Prediction <= 0, 1e-204, Prediction)) %>%
    mutate(Prediction = ifelse(Prediction <= threshold, 0, 1)) %>%
    mutate(Accuracy = ifelse(Prediction == Observed, 1, 0)) %>%
    group_by(Observed) %>%
    summarise(Accuracy = mean(Accuracy))
  # Accuracy on the negative class is the TNR; on the positive class, TPR.
  tnr <- subthreshold %>%
    filter(Observed == 0) %>%
    dplyr::select(Accuracy) %>%
    as.numeric()
  tpr <- subthreshold %>%
    filter(Observed == 1) %>%
    dplyr::select(Accuracy) %>%
    as.numeric()
  rows[[k]] <- tibble(Threshold = threshold,
                      TNR = tnr,
                      TPR = tpr)
}
curve <- bind_rows(rows)
And now, we can visualize the discrete ROC:
# Discrete ROC curve across thresholds; the x axis is reversed so the
# curve reads like a conventional ROC plot.
ggplot(arrange(curve, TPR), aes(x=TNR, y=TPR)) +
geom_point(size=2, col="red", alpha=0.5) +
geom_line(col="red") +
ylab("Sensitivity (True Positive Rate)") +
xlab("Specificity (True Negative Rate)") +
scale_x_reverse() +
# ylim(0, 1) +
# xlim(1, 0) +
ggtitle("ROC Curve for Different Thresholds") +
geom_abline(slope=1, intercept = 1, col="grey", linetype = "dashed") +
theme_pander()
And now, let’s visualize the beta weights of the connections
# Label the columns of the per-fold beta matrix.
# FIX: the original call used `paste=""`, which paste() silently absorbs
# into `...` (producing labels like "Connection 1 "); `sep=""` is what was
# intended ("Connection1", "Connection2", ...).
colnames(Xe) <- paste("Connection", 1:nrow(connectome), sep="")
wconnections <- as_tibble(Xe)
# Long format: one row per (connection, cross-validation fold) beta value.
lconnections <- pivot_longer(wconnections, cols=colnames(Xe),
names_to="Connection", values_to = "Beta")
# Sort the connectome by beta so the axis labels below line up with the plot.
connectome <- connectome %>% arrange(Beta)
# Per-connection beta weights across CV folds: jittered points, mean, and
# bootstrapped CI, labeled by network pair and ROI pair.
ggplot(lconnections, aes(x = reorder(Connection, Beta), y = Beta)) +
geom_point(aes(col=Beta), alpha=.5,
size=2,
position = position_jitter(height = 0, width = 0.3)) +
stat_summary(fun.data = "mean_sdl", geom="point", fill="black", alpha=1, size=1) +
scale_color_gradient2(low = "dodgerblue",
mid = "wheat",
high = "red2",
midpoint = 0) +
# Relabel the x axis with "NetworkPair (roi-roi)" strings; this relies on
# connectome having been arranged by Beta to match reorder() above.
scale_x_discrete(labels =
paste(connectome$network_names,
" (",
connectome$connection,
")", sep="")) +
ggtitle("Connection Weights\nAcross Cross-Validation") +
ylab(expression(paste(beta, " value"))) +
xlab("Connection") +
geom_hline(yintercept = 0, col="grey") +
stat_summary(fun.data = "mean_cl_boot",
col="black", geom="errorbar", width=1) +
#scale_color_viridis(option="plasma", begin=0.2, end=0.9) +
theme_pander() +
theme(axis.text.y = element_text(angle=0, hjust=1),
legend.position = "NA") +
#ylim(-3, 3) +
coord_flip()
Here, we will examine the quality of our Lasso model by doing a series of tests.
In the ablation test, we remove all the connections with significant beta values, and check whether the results are still significant.
# Ablation test: refit the Lasso using ONLY the connections that were NOT
# selected (beta == 0), over the same lambda path as the full model.
XX <- X[, conn_betas$Beta == 0]
fit_wo <- glmnet(y = Y,
x = XX,
alpha=1,
lambda = fit$lambda,
family = "binomial",
type.measure = "class",
weights = W,
standardize = T
)
# LOO cross-validation of the ablated model (same settings as fit.cv).
fit_wo.cv <- cv.glmnet(y = Y,
x = XX,
alpha=1,
weights = W,
lambda = fit$lambda,
standardize=T,
type.measure = "class",
family = "binomial",
grouped=F,
nfolds=length(Y)
)
The model does converge, but its overall classification error is much higher.
# Coefficient paths for the ablated model; the dashed line marks the L1
# norm of its lambda.1se solution.
plot(fit_wo, sub="Beta Values for Connectivity")
## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values
L1norm <- sum(abs(fit_wo$beta[,which(fit_wo$lambda==fit_wo.cv$lambda.1se)]))
abline(v=L1norm, lwd=2, lty=2)
It is useful to plot the two \(\lambda\)-curves (with and without the relevant connections) on the same plot.
# Collect both CV curves (full vs. ablated) into one tibble for plotting.
lasso_df_wo <- tibble(lambda=fit_wo.cv$lambda,
error=fit_wo.cv$cvm,
sd=fit_wo.cv$cvsd)
lasso_df$Model <- "Full Model"
lasso_df_wo$Model <- "Without the Selected Connections"
lasso_uber <- rbind(lasso_df, lasso_df_wo)
# Overlayed CV-error curves with +/- 1 SD ribbons; the dashed line marks
# the full model's lambda.1se.
ggplot(lasso_uber, aes(x = lambda, y = error, fill=Model)) +
scale_color_d3() +
scale_fill_d3()+
geom_ribbon(aes(ymin = error - sd,
ymax = error + sd),
alpha = 0.5,
#fill="blue"
) +
geom_line(aes(col=Model), lwd=2) +
xlab(expression(lambda)) +
ylab("Cross-Validation Error") +
ggtitle(expression(paste(bold("Cross Validation Error Across "), lambda))) +
geom_vline(xintercept = fit.cv$lambda.1se,
linetype="dashed") +
theme_pander() +
theme(legend.position="bottom")
Then, we examine the Variance Inflation Factor (VIF). To calculate the VIF, we need to first create a linear model of the factor effects:
# Linear probability model on the Lasso-selected connections, used below to
# compute variance inflation factors.
dfX <- data.frame(cbind(Y, X[,betas != 0]))
mod<-lm(Y ~ . + 1, as.data.frame(dfX))
We can now calculate the VIF and turn the results into a tibble:
# Variance inflation factors (car::vif) for the selected regressors.
vifs <- vif(mod)
vifsT <- tibble(VIF = vifs)
And, finally, we can plot a histogram of the distribution of VIF values. VIF values < 10 are considered non-collinear; VIF values < 5 are great. All of our factors have VIF values that are much smaller than 5, which implies that they are as close to an orthogonal basis set as possible.
# Histogram of VIF values; values below 5 indicate negligible collinearity
# among the selected regressors.
ggplot(vifsT, aes( x =VIF)) +
geom_histogram(col="white", binwidth = 0.1, fill="blue", alpha=0.4) +
theme_pander() +
xlab("VIF Value") +
ylab("Number of Predictors") +
ggtitle("Distribution of Variance Inflation Factors")
Standard LOOV cross-validation suffers from data leakage: Since the same data is used for both validation (hyper-parameter fitting, in this case, the \(\lambda\) parameter) and testing, success on testing is biased by the fact that validation was performed on the same sample.
The canonical way to handle this problem is to have separate train-validate-test groups. This, however, runs into the problem of dividing the data properly and having a reduced sample.
An alternative solution is to do nested cross-validation. In this case, the data is analyzed with two LOOV loops: at every turn, one subject is left out as a test subject, and the remaining N-1 are used for training and validation. This group is then again split, with one participant held out for validation and the remaining N-2 used for training. The inner loop is performed until validation is complete, at which point we use the hyper-parameters that guarantee the best generalization to test the model on the original, left-out participant. The process is repeated for every participant.
Note that a different model (and a different profile of \(\lambda\)) is generated for every participant. To obtain the best model, we first select the \(\lambda_{1SE}\) value. If there are no regressors left (i.e., \(N(\beta \neq 0) = 0\)), we select \(\lambda_{min}\). If we still have no regressors, we select the largest value of \(\lambda\) for which at least one regressor is left \(\neq 0\).
# Nested leave-one-out cross-validation: the outer loop holds out one test
# subject; the inner LOO (cv.glmnet) selects lambda on the remaining N-1.
N <- length(Y)
P <- ncol(X)
# Per-fold beta vectors and held-out predictions.
betas <- matrix(rep(0, P * N), nrow = N)
Yp <- rep(0, N)
# Minimum number of nonzero features required of the selected lambda.
minF = 1
# Fisher z-transform of the partial correlations.
# NOTE(review): this overwrites X in place; re-running this chunk would
# apply atanh twice.
X <- atanh(X)
#dfX <- as.data.frame(cbind(Y, X))
for (i in 1:N) {
# Outer hold-out: drop subject i from the training data and weights.
Ytrain <- Y[-i]
Xtrain <- X[-i,]
Wtrain <- W[-i]
# fit <- ncvreg
fit <- glmnet(y = Ytrain,
x = Xtrain,
weights = Wtrain,
alpha = 1,
family = "binomial",
type.measure ="class",
standardize = T
)
# Inner LOO CV on the N-1 training subjects to pick lambda.
fit.cv <- cv.glmnet(y = Ytrain,
x = Xtrain,
alpha=1,
weights=Wtrain,
#penalty="SCAD",
family = "binomial",
type.measure = "class",
standardize=T,
grouped=F,
#nfolds=5
nfolds=length(Ytrain)
)
# Prefer lambda.1se; fall back to lambda.min, and if even that leaves no
# features, take the largest lambda keeping at least minF regressors.
lambda <- fit.cv$lambda.min
nzero <- fit.cv$nzero[which(fit.cv$lambda == fit.cv$lambda.min)]
if (fit.cv$nzero[which(fit.cv$lambda == fit.cv$lambda.1se)] > 0) {
lambda <- fit.cv$lambda.1se
nzero <- fit.cv$nzero[which(fit.cv$lambda == fit.cv$lambda.1se)]
}
if (nzero < minF) {
# If no features, select a less-generalizable lambda
lambda <- fit.cv$lambda[which(fit.cv$nzero >= minF)[1]]
}
B <- fit$beta[,which(fit$lambda==lambda)]
#B <- fit$beta[,which(fit$lambda==fit$lambda[60])]
#print(B)
#print(fit.cv$lambda.min)
#plot(fit.cv)
if (length(B[B!=0])) {
#print(c(i, length(B[B!=0])))
# Refit an unpenalized linear model on the selected features...
dfX <- data.frame(cbind(Y, X[, B != 0]))
lmod<-lm(Y ~ . + 1, data=dfX[-i,])
#print(lmod)
#Xtest <- dfX[i,-1]
# ...but predict using the lm intercept plus the PENALIZED glmnet betas.
# NOTE(review): mixing lm's intercept with glmnet's logit-scale
# coefficients is unusual -- confirm this is intended.
yp <- lmod$coefficients[1] + sum(B*X[i,])
} else {
# No features survived: predict the training-set base rate.
yp <- mean(Ytrain)
}
betas[i,] <- B
Yp[i] <- yp
}
One of the problems of nested CV is that a different model is tested for each participant. To get a sense of the most predictive regressors, we can simply average the coefficients across models:
# Average the per-subject coefficient profiles and keep only connections
# with a non-zero mean beta; save the result as a connectome table.
mean_betas <- colMeans(betas)
conn_betas <- as_tibble(data.frame(index = I$index, Beta = mean_betas))
connectome_nested <- order %>%
  filter(index %in% I$index) %>%
  inner_join(conn_betas) %>%
  dplyr::select(-censor2) %>%
  filter(Beta != 0) %>%
  arrange(index)
## Joining, by = "index"
write_csv(connectome_nested, file="strategy_mr_nestd.csv")
Now, let’s do a real predicted vs. observed graph:
# Predicted vs. observed strategy, with points color-coded by whether the
# discretized prediction (threshold at 0.5) matches the observed label.
wcomparison <- tibble(Observed = Y,
                      Predicted = Yp,
                      DiscretePredicted = ifelse(Yp < 0.5, 0, 1))
wcomparison <- wcomparison %>%
  mutate(Accuracy = ifelse(DiscretePredicted == Observed,
                           "Correct",
                           "Misclassified"))
# Correlation truncated (not rounded) to two decimals for the annotation.
rval <- floor(100 * cor(Y, Yp)) / 100
p <- ggplot(wcomparison,
            aes(x = Predicted, y = Observed, col = Accuracy)) +
  geom_point(size = 4, alpha = 0.6,
             position = position_jitter(height = 0.02)) +
  geom_abline(intercept = 0, slope = 1,
              col = "red", linetype = "dashed") +
  scale_color_d3() +
  theme_pander() +
  theme(legend.position = "right") +
  guides(col = guide_legend("Classification")) +
  coord_fixed(xlim = c(0, 1), ylim = c(0, 1)) +
  annotate("text", x = 0.3, y = 0.7,
           label = paste0("r(", length(Y), ") = ", rval, ", p < 0.001")) +
  ylab("Observed Strategy") +
  xlab("Predicted Strategy") +
  ggtitle("Decision Strategy:\nCross-Validation") +
  theme(legend.position = "bottom")
# Add marginal density plots on both axes.
ggMarginal(p,
           fill = "grey",
           alpha = 0.75,
           type = "density",
           col = "darkgrey",
           margins = "both")
And now, ROC to get the accuracy.
# Binarize the predictions at 0.5 and compute/plot the ROC curve.
wcomparison <- wcomparison %>%
  mutate(ROCPrediction = if_else(Predicted < 0.5, 0, 1))
rocobj <- roc(wcomparison$Observed, wcomparison$ROCPrediction)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
g <- ggroc(rocobj, col = "red") +
  geom_point(aes(y = rocobj$sensitivities, x = rocobj$specificities),
             col = "red", size = 4, alpha = .5) +
  ggtitle("ROC Curve for Model") +
  xlab("Specificity (FPR)") +
  ylab("Sensitivity (TPR)") +
  geom_segment(aes(x = 1, xend = 0, y = 0, yend = 1),
               color = "grey", linetype = "dashed") +
  theme_pander()
g
The final accuracy of the prediction model is 0.8312188
To assess the robustness of our threshold, we can plot a ROC curve with specificities and sensitivities for sliding values of the prediction threshold.
# Sweep the classification threshold from 0 to 1 in steps of 0.01 and record
# the true negative rate (specificity) and true positive rate (sensitivity)
# at each step. Rows are built in a list and bound once, instead of growing
# `curve` with rbind() inside the loop (quadratic copying).
roc_point <- function(threshold) {
  # Accuracy by observed class at this threshold. Predictions are first
  # clamped: values > 1 become 1, values <= 0 become a tiny positive number
  # so that a threshold of 0 still classifies them as positive.
  subthreshold <- wcomparison %>%
    mutate(Prediction = ifelse(Predicted > 1, 1, Predicted)) %>%
    mutate(Prediction = ifelse(Prediction <= 0, 1e-204, Prediction)) %>%
    mutate(Prediction = ifelse(Prediction <= threshold, 0, 1)) %>%
    mutate(Accuracy = ifelse(Prediction == Observed, 1, 0)) %>%
    group_by(Observed) %>%
    summarise(Accuracy = mean(Accuracy))
  # Accuracy on the negative class = TNR; on the positive class = TPR.
  tnr <- subthreshold %>%
    filter(Observed == 0) %>%
    dplyr::select(Accuracy) %>%
    as.numeric()
  tpr <- subthreshold %>%
    filter(Observed == 1) %>%
    dplyr::select(Accuracy) %>%
    as.numeric()
  tibble(Threshold = threshold, TNR = tnr, TPR = tpr)
}
curve <- bind_rows(lapply(seq(0, 1, 0.01), roc_point))
And now, we can visualize the discrete ROC:
# Discrete ROC built from the threshold sweep above; x axis reversed so
# specificity runs 1 -> 0 as in a conventional ROC plot.
curve %>%
  arrange(TPR) %>%
  ggplot(aes(x = TNR, y = TPR)) +
  geom_point(size = 2, col = "red", alpha = 0.5) +
  geom_line(col = "red") +
  ylab("Sensitivity (True Positive Rate)") +
  xlab("Specificity (True Negative Rate)") +
  scale_x_reverse() +
  ggtitle("ROC Curve for Different Thresholds") +
  geom_abline(slope = 1, intercept = 1, col = "grey", linetype = "dashed") +
  theme_pander()
We can look at whether there are differences in connectivity between declarative vs. procedural individuals using quantitative methods.
# skip this block
# Proportion of declarative (Beta >= 0) vs. procedural (Beta < 0)
# connections, broken down by network pair and connection type.
connectome %>%
  #filter(connection_type != 'Within') %>%
  mutate(Group = if_else(Beta >= 0, 'Declarative', 'Procedural')) %>%
  group_by(network_names, connection_type, Group) %>%
  summarise(n = n()) %>%
  mutate(freq = n / sum(n)) %>%
  ggplot(aes(x = network_names, y = freq)) +
  geom_bar(aes(fill = Group),
           stat = "identity", width = .4, position = position_stack()) +
  facet_grid(network_names ~ .) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
# Split connections into positive betas (declarative individuals) and
# negative betas (procedural individuals).
# Candidate cutoffs: bounds of the 95% confidence interval on Beta...
beta_ci <- t.test(connectome$Beta, conf.level = .95)$conf.int
lowerBeta <- beta_ci[1]
upperBeta <- beta_ci[2]
# ...overridden here to use zero as the cutoff instead.
lowerBeta <- 0
upperBeta <- 0
connectome_beta_positive <- connectome %>% subset(Beta > upperBeta)
connectome_beta_negative <- connectome %>% subset(Beta < lowerBeta)
# All ROIs touched by a positive-beta connection (either endpoint).
lROIs_beta_positive <- unique(c(vregion_from(connectome_beta_positive$connection),
                                vregion_to(connectome_beta_positive$connection)))
rois_beta_positive <- power2011[lROIs_beta_positive, ]
# Network distribution of ROIs in positive-beta connections, expressed as a
# fraction of all such ROIs; networks with no hits are kept with N = 0.
roi_stats_beta_positive <- rois_beta_positive %>%
  group_by(NetworkName, Color) %>%
  summarise(N = n() / length(lROIs_beta_positive)) %>%
  # join against the full atlas so every network appears in the summary
  right_join(power2011 %>% distinct(NetworkName, Color)) %>%
  add_column(Source = "beta_positive") %>%
  mutate(N = replace_na(N, 0)) %>%
  arrange(NetworkName)
## `summarise()` has grouped output by 'NetworkName'. You can override using the `.groups` argument.
## Joining, by = c("NetworkName", "Color")
#power2011 %>%
#  mutate(beta_positive = ROI %in% lROIs_beta_positive) %>%
#  group_by(NetworkName, Color, beta_positive) %>%
#  summarise(N=length(Color)/length(lROIs_beta_positive)) %>%
#  add_column(Source="beta_positive") %>%
#  filter(beta_positive==T)
# Same network-distribution summary for negative-beta (procedural)
# connections: collect the ROIs at either endpoint, then tally by network.
lROIs_beta_negative <- unique(c(vregion_from(connectome_beta_negative$connection),
                                vregion_to(connectome_beta_negative$connection)))
rois_beta_negative <- power2011[lROIs_beta_negative, ]
roi_stats_beta_negative <- rois_beta_negative %>%
  group_by(NetworkName, Color) %>%
  summarise(N = n() / length(lROIs_beta_negative)) %>%
  # join against the full atlas so every network appears in the summary
  right_join(power2011 %>% distinct(NetworkName, Color)) %>%
  add_column(Source = "beta_negative") %>%
  mutate(N = replace_na(N, 0)) %>%
  arrange(NetworkName)
## `summarise()` has grouped output by 'NetworkName'. You can override using the `.groups` argument.
## Joining, by = c("NetworkName", "Color")
# Pie charts comparing the network distribution of ROIs across the full
# atlas (noi_stats) and the positive/negative beta sets.
noi_stats %>%
  rbind(roi_stats_beta_positive) %>%
  rbind(roi_stats_beta_negative) %>%
  ggplot(aes(x = "", y = N, fill = NetworkName)) +
  geom_bar(stat = "identity", col = "white", width = 1) +
  facet_grid(~Source, labeller = label_both) +
  coord_polar("y", start = 0) +
  scale_y_continuous(labels = scales::percent_format(accuracy = 2L)) +
  #scale_fill_viridis(discrete = TRUE) +
  scale_fill_ucscgb() +
  ylab("") +
  xlab("") +
  ggtitle("Distribution of Regions") +  # fixed title typo ("Distriution")
  #geom_text_repel(aes(label=percent(N, .01)), col="black",
  #                position=position_stack(vjust=1), size=3) +
  theme_pander() +
  guides(fill = guide_legend(ncol = 2)) +
  theme(legend.position = "bottom")
Looking at positive-beta networks vs. negative-beta networks, there is no difference in ROI distribution.
# Does the positive-beta ROI distribution differ from the atlas-wide one?
# Counts are recovered by scaling the proportions back up by the number of
# ROIs; expected proportions (p) come from the full-atlas network stats.
chisq.test(roi_stats_beta_positive$N*length(lROIs_beta_positive), p = noi_stats$N)
## Warning in chisq.test(roi_stats_beta_positive$N * length(lROIs_beta_positive), :
## Chi-squared approximation may be incorrect
##
## Chi-squared test for given probabilities
##
## data: roi_stats_beta_positive$N * length(lROIs_beta_positive)
## X-squared = 19.537, df = 13, p-value = 0.1074
# Same goodness-of-fit test for the negative-beta ROI distribution
# against the atlas-wide network proportions.
chisq.test(roi_stats_beta_negative$N*length(lROIs_beta_negative), p = noi_stats$N)
## Warning in chisq.test(roi_stats_beta_negative$N * length(lROIs_beta_negative), :
## Chi-squared approximation may be incorrect
##
## Chi-squared test for given probabilities
##
## data: roi_stats_beta_negative$N * length(lROIs_beta_negative)
## X-squared = 18.314, df = 13, p-value = 0.146
# Direct contingency test: positive-beta vs. negative-beta ROI counts.
chisq.test(x = roi_stats_beta_positive$N*length(lROIs_beta_positive),
y = roi_stats_beta_negative$N*length(lROIs_beta_negative))
## Warning in chisq.test(x = roi_stats_beta_positive$N *
## length(lROIs_beta_positive), : Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: roi_stats_beta_positive$N * length(lROIs_beta_positive) and roi_stats_beta_negative$N * length(lROIs_beta_negative)
## X-squared = 53.667, df = 54, p-value = 0.4872
Next, we look at the between/within-network distribution of the connections.
# Within/between-network composition of positive- and negative-beta
# connections: raw counts (N) and proportions (P) per connection type.
connectome_stats_beta_positive <- connectome_beta_positive %>%
  group_by(connection_type) %>%
  summarise(N = n(), P = n() / nrow(connectome_beta_positive)) %>%
  add_column(Source = "beta_positive")
connectome_stats_beta_negative <- connectome_beta_negative %>%
  group_by(connection_type) %>%
  summarise(N = n(), P = n() / nrow(connectome_beta_negative)) %>%
  add_column(Source = "beta_negative")
# Pie charts of within- vs. between-network connectivity for the reference
# distribution (coi_stats) and the positive/negative beta sets.
coi_stats %>%
  rbind(connectome_stats_beta_positive) %>%
  rbind(connectome_stats_beta_negative) %>%
  ggplot(aes(x = "", y = P, fill = connection_type)) +
  geom_bar(stat = "identity", col = "grey", width = 1) +
  facet_grid(~Source, labeller = label_both) +
  coord_polar("y", start = 0) +
  scale_y_continuous(labels = scales::percent_format(accuracy = 2L)) +
  scale_fill_jama() +
  scale_color_jama() +
  ylab("") +
  xlab("") +
  # fixed title typo ("Distribuction")
  ggtitle("Distribution of Connectivity\n(Within/Between Networks)") +
  geom_text_repel(aes(label = percent(P, .1)), col = "white",
                  position = position_stack(vjust = 1), size = 3) +
  theme_pander() +
  theme(legend.position = "bottom")
The distribution is different from Power 2011, but there is no difference between the declarative vs. procedural networks.
# Do positive-beta connection counts match the reference within/between
# proportions from the full connectome (coi_stats)?
chisq.test(connectome_stats_beta_positive$N, p=coi_stats$P)
## Warning in chisq.test(connectome_stats_beta_positive$N, p = coi_stats$P): Chi-
## squared approximation may be incorrect
##
## Chi-squared test for given probabilities
##
## data: connectome_stats_beta_positive$N
## X-squared = 55.863, df = 1, p-value = 7.769e-14
# Same goodness-of-fit test for the negative-beta connection counts.
chisq.test(connectome_stats_beta_negative$N, p=coi_stats$P)
## Warning in chisq.test(connectome_stats_beta_negative$N, p = coi_stats$P): Chi-
## squared approximation may be incorrect
##
## Chi-squared test for given probabilities
##
## data: connectome_stats_beta_negative$N
## X-squared = 97.543, df = 1, p-value < 2.2e-16
# Positive-beta counts tested against the negative-beta proportions.
chisq.test(x = connectome_stats_beta_positive$N, p = connectome_stats_beta_negative$P)
##
## Chi-squared test for given probabilities
##
## data: connectome_stats_beta_positive$N
## X-squared = 0.50134, df = 1, p-value = 0.4789
# Mirror of the previous test: negative-beta counts against the
# positive-beta proportions.
chisq.test(p = connectome_stats_beta_positive$P, x = connectome_stats_beta_negative$N)
##
## Chi-squared test for given probabilities
##
## data: connectome_stats_beta_negative$N
## X-squared = 0.64103, df = 1, p-value = 0.4233